In this notebook we use the Bayesian Sequential Survey Estimator on an mVAM dataset collected in Yemen starting in March of 2018. We use three datasets:
mVAM data - collected daily over the phone with key demographic variables and the Food Consumption Score indicator
Baseline data (F2F)- Emergency Food Security and Nutrition Assessment (EFSNA) data collected face-to-face in Yemen during ceasefire in fall of 2016. Also contains key demographic variables and the Food Consumption Score indicator
Demographic and Health Survey (DHS) data - 2013 survey of over 17000 households in Yemen collecting key demographic and health indicators. It does not have the Food Consumption Score, but we use it solely for the purpose of posterior simulation, predicting FCS from the demographic variables, which are presumably as representative as can be without a census
We also use an additional dataset of population by governorate to convert prevalence estimates to absolute values.
##################
### Data Prep ###
##################
library(xlsx)
library(survey)
library(plyr)
#mVAM Survey - phone-survey responses we will use to update the baseline model
#(read with ';' separator, empty strings treated as NA)
ymnRspDF <- data.table::fread('ymnData/mVAMYmnObs.csv',sep=';',na.strings='')
#DHS Survey - demographic data to use for posterior simulation
dhsDF <- data.table::fread('ymnData/DHS2013.csv',sep=';',na.strings='')
dhsDF <- as.data.frame(dhsDF)
#EFSNA face-to-face survey - data to fit baseline model (this dataset is confidential and cannot be shared)
efsnaDF <- data.table::fread('ymnData/ymnEFSNA2016.csv',sep=';',na.strings='')
#Population Data - 2017 CSO projections, used to convert prevalence to absolute numbers
#NOTE(review): hard-coded user-specific absolute path; this breaks on any other
#machine — move the file under ymnData/ and use a relative path like the reads above
ymnPopDF <- read.xlsx('C:/Users/gaurav.singhal/Desktop/ymnSeqMdl/cso_2017_population_projection_sexage_disagreggated.xlsx',
sheetIndex=1)
#Create table for IPC Phase
# Survey design for the 2016 EFSNA face-to-face baseline: two-stage design
# (clusters, then households) stratified by governorate, Brewer PPS, nested ids.
svyYmn <- svydesign(
  ids = ~Cluster_No + ID,
  strata = ~ADM1_NAME + NULL,
  weights = ~weight,
  pps = 'brewer',
  data = efsnaDF,
  nest = TRUE
)
# Weighted share of each Food Consumption Group by governorate (rows sum to 1)
ymn.FCG.2016 <- prop.table(svytable(~ADM1_NAME + FCS_Cat, svyYmn), 1)
colnames(ymn.FCG.2016) <- c('2016.Severe', '2016.Moderate', '2016.OK')
# Sa'ada and Al Maharah were not covered by the EFSNA; proxy them with
# Hajjah and Hadramaut respectively
proxyRows <- ymn.FCG.2016[c("Hajjah", "Hadramaut"), ]
rownames(proxyRows) <- c("Sa'ada", "Al Maharah")
ymn.FCG.2016 <- as.data.frame(rbind(ymn.FCG.2016, proxyRows))
# Assign 2016 IPC phase to governorates: Emergency unless listed below
ymn.FCG.2016$IPCclass <- 'Emergency'
ymn.FCG.2016$IPCclass[rownames(ymn.FCG.2016) %in% c('Hadramaut', 'Al Maharah')] <- 'Stressed'
ymn.FCG.2016$IPCclass[rownames(ymn.FCG.2016) %in%
                        c('Al Jawf', 'Marib', 'Amran', 'Amanat Al Asimah', 'Al Mahwit',
                          'Al Hudaydah', 'Dhamar', 'Raymah', 'Ibb')] <- 'Crisis'
# Expose the governorate name as a column for later merge() calls
ymn.FCG.2016$ADM1_NAME <- rownames(ymn.FCG.2016)
rm(svyYmn, proxyRows)
#Filter mVAM dataset for latest questionnaire and Merge
# Columns required from the mVAM responses (ids, weights, geography, the FCS
# indicator, and the socio-demographic covariates used by the models)
colsmVAM <- c('ObsID','RspID','SvyID','SelectWt','CmbAdjWt','ADM1_NAME','ADM2_NAME',
              'rCSI','FCS','FCG','FoodInsecure','isUrban','isIDP',
              'HHSizeGrp','HoHSex_is_F','HoHEdu','H2OSrc','FuelSrc','HH_has_Elctrc','Toilet_is_Flush')
# Keep only rounds from the latest questionnaire (SvyID >= 181)
mvamDF <- as.data.frame(ymnRspDF[ymnRspDF$SvyID >= 181, ])
# Drop columns that are at least 90% missing
mvamDF <- mvamDF[, colSums(is.na(mvamDF)) < 0.9 * nrow(mvamDF)]
# Retain only rows complete on the modelling columns
mvamDF <- mvamDF[complete.cases(mvamDF[, colsmVAM]), ]
# Put the modelling columns first, any remaining columns after
mvamDF <- mvamDF[, c(colsmVAM, setdiff(colnames(mvamDF), colsmVAM))]
# Attach the 2016 baseline FCG shares and IPC phase by governorate
mvamDF <- merge(mvamDF, ymn.FCG.2016, all.x = TRUE, by = 'ADM1_NAME')
# Free-text fields are unused downstream
mvamDF$FreeResponse <- NULL
mvamDF$FreeResponseEng <- NULL
#Filter DHS dataset for complete.cases and merge
colsDHS <- c('HHID','ADM1_NAME','wgt','HHS_Class','HHS_Score','isUrban',
'HHSizeGrp','HoHSex_is_F','HoHEdu','H2OSrc','FuelSrc','HH_has_Elctrc','Toilet_is_Flush')
dmgDF <- as.data.frame(dhsDF)
dmgDF <- dmgDF[complete.cases(dmgDF[,colsDHS]),]
dmgDF <- dmgDF[,append(colsDHS,setdiff(colnames(dhsDF),colsDHS))]
dmgDF <- merge(dmgDF,ymn.FCG.2016,all.x=TRUE,by='ADM1_NAME')
#Filter EFSNA df for complete.cases and merge
colsEFSNA <- c('ID','ADM1_NAME','wgt','isUrban','FCS','isIDP',
'HHSizeGrp','HoHSex_is_F','HoHEdu','H2OSrc','FuelSrc','HH_has_Elctrc','Toilet_is_Flush')
efsnaDF <- as.data.frame(efsnaDF)
baseDF <- efsnaDF[complete.cases(efsnaDF[,colsEFSNA]),]
baseDF <- baseDF[,colsEFSNA]
baseDF <- merge(baseDF,ymn.FCG.2016,all.x=TRUE,by='ADM1_NAME')
baseDF$FCS[baseDF$FCS<12] <- 12
baseDF$logFCS.2016 <- log(baseDF$FCS)
#add rows for Sa'ada and Al Maharah as they are missing the original baseline data
extDF <- baseDF[baseDF$ADM1_NAME %in% c('Hajjah','Hadramaut'),]
extDF$ADM1_NAME <- revalue(extDF$ADM1_NAME,c('Hajjah'="Sa'ada",'Hadramaut'='Al Maharah'))
baseDF <- rbind(baseDF,extDF)
baseDF$ADM1_NAME <- as.character(baseDF$ADM1_NAME)
rm(extDF)
print('Data Loaded!')
## ADM1_NAME ID wgt isUrban FCS isIDP HHSizeGrp HoHSex_is_F HoHEdu
## 1 Abyan 1 1.036 TRUE 18.0 FALSE (5,8] TRUE noEduc
## 2 Abyan 2 1.038 FALSE 20.5 FALSE (16,100] FALSE primary
## 3 Abyan 732 0.942 FALSE 22.5 FALSE (5,8] TRUE noEduc
## 4 Abyan 3528 0.930 FALSE 21.0 FALSE (5,8] FALSE noEduc
## 5 Abyan 1464 1.038 TRUE 22.0 FALSE (8,12] FALSE higher
## 6 Abyan 3409 0.942 FALSE 42.0 FALSE (5,8] FALSE secondary
## H2OSrc FuelSrc HH_has_Elctrc Toilet_is_Flush 2016.Severe
## 1 piped wood FALSE FALSE 0.1305556
## 2 protectSrc wood FALSE TRUE 0.1305556
## 3 protectSrc gas FALSE TRUE 0.1305556
## 4 tankTruck wood FALSE TRUE 0.1305556
## 5 protectSrc gas FALSE TRUE 0.1305556
## 6 protectSrc wood FALSE TRUE 0.1305556
## 2016.Moderate 2016.OK IPCclass logFCS.2016
## 1 0.3527778 0.5166667 Emergency 2.890372
## 2 0.3527778 0.5166667 Emergency 3.020425
## 3 0.3527778 0.5166667 Emergency 3.113515
## 4 0.3527778 0.5166667 Emergency 3.044522
## 5 0.3527778 0.5166667 Emergency 3.091042
## 6 0.3527778 0.5166667 Emergency 3.737670
## ADM1_NAME isUrban FCS isIDP HHSizeGrp HoHSex_is_F HoHEdu H2OSrc
## 1 Abyan FALSE 57 FALSE (5,8] FALSE secondary piped
## 2 Abyan FALSE 77 FALSE (8,12] FALSE secondary piped
## 3 Abyan FALSE 94 FALSE (5,8] FALSE secondary piped
## 4 Abyan FALSE 93 FALSE (8,12] FALSE secondary piped
## 5 Abyan FALSE 63 FALSE (2,5] FALSE higher other
## 6 Abyan FALSE 48 FALSE (5,8] FALSE secondary piped
## FuelSrc HH_has_Elctrc Toilet_is_Flush 2016.Severe 2016.Moderate
## 1 gas TRUE TRUE 0.1305556 0.3527778
## 2 gas TRUE TRUE 0.1305556 0.3527778
## 3 gas TRUE TRUE 0.1305556 0.3527778
## 4 gas TRUE TRUE 0.1305556 0.3527778
## 5 gas TRUE TRUE 0.1305556 0.3527778
## 6 gas TRUE TRUE 0.1305556 0.3527778
## 2016.OK IPCclass SelectWt ObsDate SvyID
## 1 0.5166667 Emergency 0.3333333 2018-05-12 183
## 2 0.5166667 Emergency 0.5000000 2018-05-10 183
## 3 0.5166667 Emergency 0.3333333 2018-06-11 184
## 4 0.5166667 Emergency 0.5000000 2018-04-16 182
## 5 0.5166667 Emergency 0.5000000 2018-05-08 183
## 6 0.5166667 Emergency 0.3333333 2018-04-01 182
## ADM1_NAME wgt isUrban HHSizeGrp HoHSex_is_F HoHEdu H2OSrc FuelSrc
## 1 Ibb 1.026 TRUE (8,12] FALSE primary piped gas
## 2 Ibb 1.026 TRUE (2,5] FALSE primary piped gas
## 3 Ibb 1.026 TRUE (5,8] FALSE higher piped gas
## 4 Ibb 1.026 TRUE (5,8] TRUE noEduc piped gas
## 5 Ibb 1.026 TRUE (5,8] FALSE noEduc piped gas
## 6 Ibb 1.026 TRUE (2,5] FALSE noEduc piped gas
## HH_has_Elctrc Toilet_is_Flush
## 1 TRUE TRUE
## 2 TRUE TRUE
## 3 TRUE TRUE
## 4 TRUE TRUE
## 5 TRUE TRUE
## 6 TRUE TRUE
Next we construct a multi-level random effect model on the baseline F2F data, that predicts log(FCS) from the host of available demographic and socioeconomic information. Effectively the models divide the population into a myriad of cells, demographically decomposing the log(FCS) estimates such that we are estimating the marginal distributions of log(FCS) by demographic covariate. Those covariates that are not independent of each other (such as IPC phase and governorate) are appropriately nested such that the nested variable is conditionally independent given the other. The joint probabilities of the marginal distributions yield the individual demographic-cell value, allowing sparse cells (those with few observations) to borrow strength from neighboring cells.
First we use lme4 to fit a non-Bayesian model based on maximizing the Random-Effects Maximum Likelihood criterion. Those estimates are then set as priors to the same model but estimated using rstanarm’s Markov Chain Monte Carlo estimator. The results should largely be the same, but the latter model can be Bayes’ updated with new information.
#####################################
### Create initial baseline model ###
#####################################
library(lme4)
library(rstanarm)
# Model formula: fixed effects for the binary household traits, random
# intercepts for the categorical ones; governorate is nested within IPC phase
# because the two are not independent of each other
base.fmla <- as.formula(log(FCS) ~ HoHSex_is_F+HH_has_Elctrc+Toilet_is_Flush+isUrban+
                          (1|HHSizeGrp)+(1|HoHEdu)+(1|H2OSrc)+(1|FuelSrc)+(1|IPCclass/ADM1_NAME))
#(A) fit REML model using lme4 on the baseline data
base.mdl <- lmer(base.fmla,data=baseDF,weights=wgt)
#(B) fit Bayesian model using lme4 model values as priors. We inflate the
#    estimated sd (x10) to give the MCMC sampler more 'search room'
# FIX: the prior intercept must come from base.mdl — the original referenced
#      offset.mdl, which is only defined later in the script and so does not
#      exist here; also use the documented accessor fixef() instead of @beta
base.mdl.stan <- stan_lmer(base.fmla,data=baseDF,weights=wgt,
                           iter=3000,chains=4,cores=4,thin=2,control=list(adapt_delta=.965),QR=TRUE,
                           prior_intercept = normal(fixef(base.mdl)[1],sqrt(vcov(base.mdl)[1,1])*10))
# the REML fit is only needed to seed the prior
rm(base.mdl)
The above model allows us to correct for differences in demographics with our mVAM surveys, as we are using the mVAM information solely to estimate the means and variances of the demographic marginal distributions with respect to log(FCS). However, as mVAM uses a different modality–phone vs face-to-face–there exists a second form of bias whereby respondents’ estimates change due to survey mode. We eliminate this bias with the assumption that it is fixed (not dependent on time) and dependent on the respondents’ socio-demographic profile as well. Therefore we take an mVAM dataset from the same time period, use the above model to predict FCS (out-of-sample) using the socio-demographic variables in the mVAM dataset, and compute its errors vis-a-vis the reported mVAM FCS. Those errors are then the respondents’ theorized estimate of ‘mode-effect.’ We regress the same hierarchical random-effects model on the errors to then predict ‘mode-effect’ for new respondents. Ergo, when new mVAM data arrives, we first run the mode-effect correction model to adjust the respondents’ corresponding log(FCS) for mode-bias. As the mode-effect itself is a random variable, we must use the bootstrap technique to compute subsequent variances.
#########################################
### Create mode-effect (offset) model ###
#########################################
##(2) fit offset model to correct for mode effect using the residuals of the baseline model simulated on mVAM data
#(A) massage mvam data first because Sa'ada and Al Maharah governorates missing from baseline survey
### we assume Sa'ada=Hajjah and Al Maharah=Hadramaut for this purpose
mvamDF$ADM1_NAME <- as.factor(mvamDF$ADM1_NAME)
mvamDF$ADM1_NAME_ACTUAL <- mvamDF$ADM1_NAME
mvamDF$ADM1_NAME <- droplevels(revalue(mvamDF$ADM1_NAME,c("Sa'ada"="Hajjah","Al Maharah"="Hadramaut")))
#(B) now perform posterior simulations (i.e. predictions) of the previous baseline model on the new mVAM data
#    posterior_predict() returns a draws x observations matrix, so colMeans()
#    yields each respondent's posterior-mean log(FCS) prediction
mvamDF$logFCS.prior <- colMeans(posterior_predict(base.mdl.stan,newdata=mvamDF[,c(colsmVAM,'IPCclass')],
                                                  draws=500))
#(C) now, as we know the actual reported log(FCS) from the mVAM data, compute differences vis-a-vis the above predictions
# FIX: this assignment had been swallowed into the comment line above it, so
#      logFCS.diff — the response of the offset model — was never created
mvamDF$logFCS.diff <- with(mvamDF,log(FCS)-logFCS.prior)
#(D) revert governorates
mvamDF$ADM1_NAME <- mvamDF$ADM1_NAME_ACTUAL
mvamDF$ADM1_NAME_ACTUAL <- NULL
### Model Formula: same hierarchy as the baseline model, but regressed on the
### prediction error (the theorized mode effect)
offset.fmla <- as.formula(logFCS.diff ~ HoHSex_is_F+HH_has_Elctrc+Toilet_is_Flush+isUrban+
                            (1|HHSizeGrp)+(1|HoHEdu)+(1|H2OSrc)+(1|FuelSrc)+(1|IPCclass/ADM1_NAME))
#NOTE: SvyID==181 is a filter as to use only 1 round of information corresponding to mVAM's closest available time period to when the F2F survey was completed
#(E) first fit lme model as prior for rstanarm model
offset.mdl <- lmer(offset.fmla,data=mvamDF[mvamDF$SvyID==181,],weights=SelectWt)
#(F) fit rstanarm with lme prior
# FIX: the data argument was missing from this stan_lmer() call; fit on the
#      same single round used for the lme4 fit above. Also use fixef() rather
#      than the internal @beta slot.
offset.mdl.stan <- stan_lmer(offset.fmla,data=mvamDF[mvamDF$SvyID==181,],weights=SelectWt,
                             iter=3000,chains=4,cores=4,thin=2,control=list(adapt_delta=.965),QR=TRUE,
                             prior_intercept = normal(fixef(offset.mdl)[1],sqrt(vcov(offset.mdl)[1,1])*10))
# the REML fit is only needed to seed the prior
rm(offset.mdl)
With these two models in place we can now perform Bayesian sequential updates as new mVAM data arrives upon the baseline model that predicts log(FCS) as a function of socio-demographic variables. This is done as a batch process using a rolling window over the data of a specified time length (usually 30 days). The model for the new moving window is computed by performing a Bayesian sequential update — that is, using the previous parameter estimates as a prior for the new parameter estimates — on the previous window’s model using only the data from the current moving window. This is repeated for the whole sequence of moving windows in the dataset.
An important note is that at the first Bayesian sequential update, the model is adjusted to include the estimates from the baseline model, that is we add the term log(FCS)_baseline to the right-hand-side of the regression equation, i.e. log(FCS)_mvam_t ~ {sociodemographic random effect variables} + log(FCS)_baseline where log(FCS)_baseline are the predictions from the baseline model. Hence the priors for the random-effects come from the offset model, not the baseline model. And the random effects are effectively estimating the difference between the mVAM results and the baseline results, just as the offset model does. We call this set-up the ‘cross-sectional’ version of the Bayesian sequential update process. Another option is to use the previous time-window’s model estimate of log(FCS) in the RHS, i.e. log(FCS)_mvam_t ~ {sociodemographic random effect variables} + log(FCS)_mvam_t-1. We call this set-up to be the ‘auto-regressive’ version.
Furthermore, the round of information used to compute the above offset is discarded. The reason for doing so is obvious; we would otherwise effectively be returning the results from the F2F survey.
#source code for Bayesian Sequential Survey Estimator
#assume current working directory is repository root
source('./R/seqSvyEst.R')
#Estimate models over time windows (using cross-sectional modelling approach)
# seq.estimator.XS() is defined in seqSvyEst.R; it is called with the modelling
# columns plus IPC phase and the baseline prediction (logFCS.prior), the fitted
# baseline stan model, and each observation's date for the rolling windows.
# NOTE(review): window length (text says usually 30 days) is presumably a
# default inside seq.estimator.XS — confirm in seqSvyEst.R
seq.mdls <- seq.estimator.XS(mvamDF[,c(colsmVAM,'IPCclass','logFCS.prior')],base.mdl.stan,mvamDF$ObsDate)
Now with the posterior (updated with mVAM data) models in hand for each time-step, we simulate the results on the DHS dataset, which provides the most representative breakdown of the socio-demographic characteristics of our population in the absence of a proper census.
The function returns the probability distribution function and quantile function of FCS by strata by time-window
#Return estimates via simulating models on DHS data for each window
# mdl.simulator() (from seqSvyEst.R) simulates each window's model on the DHS
# households, applying the mode-effect correction (offset.mdl.stan), stratified
# by 'ADM1_NAME' and weighted by 'wgt'. Per the text it returns the pdf and
# quantile function of FCS by stratum by time-window.
rslt.DHS <- mdl.simulator(seq.mdls,offset.mdl.stan,dmgDF[,c(colsDHS,'IPCclass')],'ADM1_NAME','wgt')
## ADM1_NAME Start.Date End.Date FCSMean CI.Hi.FCS CI.Lo.FCS PctPoor
## 1 Abyan 2018-02-26 2018-03-28 39.40743 45.24349 34.14754 0.4789665
## 2 Abyan 2018-03-05 2018-04-04 43.62123 49.97993 38.22887 0.4187277
## 3 Abyan 2018-05-14 2018-06-13 48.04118 53.94819 42.60540 0.3597973
## 4 Abyan 2018-04-23 2018-05-23 42.66908 47.99200 37.94645 0.4219921
## 5 Abyan 2018-03-12 2018-04-11 44.94083 50.48076 39.60920 0.3933568
## 6 Abyan 2018-03-19 2018-04-18 45.40297 50.20391 40.32024 0.3806525
## CI.Hi.Poor CI.Lo.Poor PctBrdr CI.Hi.Brdr CI.Lo.Brdr Pop NumPoor
## 1 0.5544041 0.4090557 0.6811747 0.7491103 0.6128763 568000 272053.0
## 2 0.4888967 0.3494521 0.6292766 0.6973284 0.5623274 568000 237837.3
## 3 0.4194724 0.3035233 0.5748220 0.6338494 0.5109731 568000 204364.9
## 4 0.4845644 0.3647365 0.6362603 0.6980079 0.5778629 568000 239691.5
## 5 0.4631563 0.3301679 0.6098235 0.6738962 0.5442604 568000 223426.6
## 6 0.4396604 0.3255414 0.6003439 0.6584308 0.5460853 568000 216210.6
## NumBrdr
## 1 386907.2
## 2 357429.1
## 3 326498.9
## 4 361395.9
## 5 346379.8
## 6 340995.3
## # A tibble: 6 x 201
## # Groups: ADM1_NAME, End.Date [6]
## ADM1_NAME End.Date ` 0.5%` ` 1.0%` ` 1.5%` ` 2.0%` ` 2.5%` ` 3.0%`
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Abyan 2018-03-28 22.4 22.6 22.9 23.1 23.2 23.6
## 2 Abyan 2018-04-04 24.7 25.3 25.5 25.9 26.1 26.3
## 3 Abyan 2018-04-11 25.3 25.9 26.3 26.6 27.0 27.2
## 4 Abyan 2018-04-18 26.1 26.5 27.1 27.3 27.6 27.7
## 5 Abyan 2018-04-25 24.6 25.2 25.6 26.1 26.3 26.6
## 6 Abyan 2018-05-02 24.0 24.3 24.3 24.4 24.5 24.7
## # ... with 193 more variables: ` 3.5%` <dbl>, ` 4.0%` <dbl>, `
## # 4.5%` <dbl>, ` 5.0%` <dbl>, ` 5.5%` <dbl>, ` 6.0%` <dbl>, `
## # 6.5%` <dbl>, ` 7.0%` <dbl>, ` 7.5%` <dbl>, ` 8.0%` <dbl>, `
## # 8.5%` <dbl>, ` 9.0%` <dbl>, ` 9.5%` <dbl>, `10.0%` <dbl>,
## # `10.5%` <dbl>, `11.0%` <dbl>, `11.5%` <dbl>, `12.0%` <dbl>,
## # `12.5%` <dbl>, `13.0%` <dbl>, `13.5%` <dbl>, `14.0%` <dbl>,
## # `14.5%` <dbl>, `15.0%` <dbl>, `15.5%` <dbl>, `16.0%` <dbl>,
## # `16.5%` <dbl>, `17.0%` <dbl>, `17.5%` <dbl>, `18.0%` <dbl>,
## # `18.5%` <dbl>, `19.0%` <dbl>, `19.5%` <dbl>, `20.0%` <dbl>,
## # `20.5%` <dbl>, `21.0%` <dbl>, `21.5%` <dbl>, `22.0%` <dbl>,
## # `22.5%` <dbl>, `23.0%` <dbl>, `23.5%` <dbl>, `24.0%` <dbl>,
## # `24.5%` <dbl>, `25.0%` <dbl>, `25.5%` <dbl>, `26.0%` <dbl>,
## # `26.5%` <dbl>, `27.0%` <dbl>, `27.5%` <dbl>, `28.0%` <dbl>,
## # `28.5%` <dbl>, `29.0%` <dbl>, `29.5%` <dbl>, `30.0%` <dbl>,
## # `30.5%` <dbl>, `31.0%` <dbl>, `31.5%` <dbl>, `32.0%` <dbl>,
## # `32.5%` <dbl>, `33.0%` <dbl>, `33.5%` <dbl>, `34.0%` <dbl>,
## # `34.5%` <dbl>, `35.0%` <dbl>, `35.5%` <dbl>, `36.0%` <dbl>,
## # `36.5%` <dbl>, `37.0%` <dbl>, `37.5%` <dbl>, `38.0%` <dbl>,
## # `38.5%` <dbl>, `39.0%` <dbl>, `39.5%` <dbl>, `40.0%` <dbl>,
## # `40.5%` <dbl>, `41.0%` <dbl>, `41.5%` <dbl>, `42.0%` <dbl>,
## # `42.5%` <dbl>, `43.0%` <dbl>, `43.5%` <dbl>, `44.0%` <dbl>,
## # `44.5%` <dbl>, `45.0%` <dbl>, `45.5%` <dbl>, `46.0%` <dbl>,
## # `46.5%` <dbl>, `47.0%` <dbl>, `47.5%` <dbl>, `48.0%` <dbl>,
## # `48.5%` <dbl>, `49.0%` <dbl>, `49.5%` <dbl>, `50.0%` <dbl>,
## # `50.5%` <dbl>, `51.0%` <dbl>, `51.5%` <dbl>, `52.0%` <dbl>,
## # `52.5%` <dbl>, `53.0%` <dbl>, …
## # A tibble: 6 x 72
## # Groups: ADM1_NAME, End.Date [6]
## ADM1_NAME End.Date `11` `12` `13` `14` `15`
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Abyan 2018-03-28 0. 5.26e-16 1.11e-15 3.62e-16 1.27e-16
## 2 Abyan 2018-04-04 0. 5.02e-16 2.01e-15 0. 0.
## 3 Abyan 2018-04-11 1.84e-15 0. 4.44e-15 1.53e-15 0.
## 4 Abyan 2018-04-18 0. 7.60e-16 5.36e-16 4.65e-15 3.27e-15
## 5 Abyan 2018-04-25 7.54e-16 2.34e-15 1.39e-15 6.25e-15 3.70e-15
## 6 Abyan 2018-05-02 0. 1.11e-15 1.53e-15 0. 9.24e-16
## # ... with 65 more variables: `16` <dbl>, `17` <dbl>, `18` <dbl>,
## # `19` <dbl>, `20` <dbl>, `21` <dbl>, `22` <dbl>, `23` <dbl>,
## # `24` <dbl>, `25` <dbl>, `26` <dbl>, `27` <dbl>, `28` <dbl>,
## # `29` <dbl>, `30` <dbl>, `31` <dbl>, `32` <dbl>, `33` <dbl>,
## # `34` <dbl>, `35` <dbl>, `36` <dbl>, `37` <dbl>, `38` <dbl>,
## # `39` <dbl>, `40` <dbl>, `41` <dbl>, `42` <dbl>, `43` <dbl>,
## # `44` <dbl>, `45` <dbl>, `46` <dbl>, `47` <dbl>, `48` <dbl>,
## # `49` <dbl>, `50` <dbl>, `51` <dbl>, `52` <dbl>, `53` <dbl>,
## # `54` <dbl>, `55` <dbl>, `56` <dbl>, `57` <dbl>, `58` <dbl>,
## # `59` <dbl>, `60` <dbl>, `61` <dbl>, `62` <dbl>, `63` <dbl>,
## # `64` <dbl>, `65` <dbl>, `66` <dbl>, `67` <dbl>, `68` <dbl>,
## # `69` <dbl>, `70` <dbl>, `71` <dbl>, `72` <dbl>, `73` <dbl>,
## # `74` <dbl>, `75` <dbl>, `76` <dbl>, `77` <dbl>, `78` <dbl>,
## # `79` <dbl>, `80` <dbl>
Now with results in hand, we can make some pretty plots :)
library(dplyr)
library(ggplot2)
library(reshape2)
library(scales)
library(stringr)
library(stringi)
#function to rowbind when column names are different
# Row-bind two data frames even when their column names differ: df2's columns
# are renamed positionally to df1's names, then the frames are stacked.
force_bind <- function(df1, df2) {
  names(df2) <- names(df1)
  bind_rows(df1, df2)
}
# Trend of mean FCS with CI ribbon over time, faceted by governorate; panels
# are coloured by the governorate's period-average FCS (red = worse, blue = better)
ggplot(data=rslt.DHS$str.est %>% group_by(ADM1_NAME) %>% mutate(avgFCS=mean(FCSMean)),
       mapping=aes(x=End.Date,FCSMean,CI.Lo.FCS,CI.Hi.FCS,color=avgFCS,fill=avgFCS))+
  geom_line(aes(y = FCSMean))+
  geom_ribbon(aes(ymin=CI.Lo.FCS, ymax=CI.Hi.FCS),alpha=0.5,color=NA)+facet_wrap(~ADM1_NAME)+
  scale_fill_gradient(low='red',high='blue')+scale_colour_gradient(low='red',high='blue')+
  scale_x_date(breaks = pretty_breaks(14))+
  # FIX: minor_breaks was seq(80,80,5) — a single value (80), so no minor grid
  # was drawn; minor lines every 5 FCS points across the 30-80 range (matching
  # the major breaks) was clearly intended
  scale_y_continuous(minor_breaks = seq(30,80,5),breaks = seq(30,80,10))+
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        panel.grid.major.y = element_line(colour="grey", size=0.5))
# Prevalence of poor ('poor') vs poor-or-borderline ('poorbrdr') FCS over time
# with CI ribbons, faceted by governorate. force_bind() stacks the two
# threshold column sets (selected by position: cols 7-9 = poor, 10-12 =
# borderline) under the 'poor' column names.
# NOTE(review): the positional column indices assume the str.est layout shown
# in the printed output above — confirm against mdl.simulator's return
ggplot(force_bind(cbind(rslt.DHS$str.est[,c(1,3,7,8,9)],'threshold'='poor'),
cbind(rslt.DHS$str.est[,c(1,3,10,11,12)],'threshold'='poorbrdr')),
mapping=aes(x=End.Date,PctPoor,CI.Hi.Poor,CI.Lo.Poor,fill=threshold,color=threshold))+
geom_line(aes(y = PctPoor))+
geom_ribbon(aes(ymin=CI.Lo.Poor,ymax=CI.Hi.Poor),alpha=0.5,show.legend=FALSE,colour=NA)+facet_wrap(~ADM1_NAME)+
scale_x_date(breaks = pretty_breaks(14))+
scale_y_continuous(minor_breaks = seq(15,75,5)/100,breaks = seq(15,75,10)/100)+
theme(axis.text.x = element_text(angle = 90, hjust = 1),
panel.grid.major.y = element_line(colour="grey", size=0.5))
library(plotly)
# National-level ('Yemen') FCS probability distribution by window end-date
df <- rslt.DHS$pdf.est %>% filter(ADM1_NAME=='Yemen') %>% select(-ADM1_NAME)
# Contour plot: x = date, y = FCS values 20-60, z = density at each FCS value
# NOTE(review): the /10000 rescaling presumably normalizes the pdf columns —
# confirm against the units mdl.simulator emits
plot_ly(x=df$End.Date,y=c(20:60),
z=t(as.matrix(df[,as.character(c(20:60))]))/10000,type='contour',autocontour=TRUE) %>%
layout(xaxis=list(title='Date'),yaxis=list(title='FCS'),title='Evolution of FCS distribution from Apr-June')
rm(df)
#convert spatial polygons dataframe to a plain data frame for ggplot, swapping
#the polygon id for the governorate name so the maps can join on ADM1_NAME
# NOTE(review): ymnGDF (a SpatialPolygonsDataFrame) is assumed loaded elsewhere
# FIX: tibble::data_frame() is deprecated; use base data.frame() with
#      stringsAsFactors=FALSE so the join keys remain character
shp.DF <- fortify(ymnGDF) %>%
  left_join(data.frame(id=rownames(ymnGDF@data), name=ymnGDF@data$ADM1_NAME,
                       stringsAsFactors=FALSE)) %>%
  select(-id) %>% rename(id=name)
#assemble dataframe of prevalence and counts of poor FCS by governorate/window,
#dropping the national aggregate row ('Yemen')
# FIX: the results object built in this script is rslt.DHS; rslt.Base is never
#      defined here (presumably a leftover name from another notebook)
dat.DF <- rslt.DHS$str.est[rslt.DHS$str.est$ADM1_NAME!='Yemen',c('ADM1_NAME','End.Date','PctPoor','NumPoor')]
colnames(dat.DF)[1] <- 'id'
#reformat so prevalence is in buckets of 15% to 55% by 5% increments
breaks.prv <- seq(15,55,5)
# labels like "15.0-20%"; the regex pass strips leading zeros and the
# trailing "-NA%" produced by lead() on the last break
breaks.lbl <- sprintf("%2.1f-%s", breaks.prv, percent(lead(breaks.prv/100))) %>%
  stri_replace_all_regex(c("^0.0", "-NA%"), c("0", "%"), vectorize_all=FALSE) %>% head(-1)
dat.DF <- dat.DF %>% mutate(`PoorFCS%`=cut(PctPoor,breaks.prv/100,breaks.lbl))
#plot
# Choropleth of % poor FCS by governorate, faceted by window end-date:
# geom_polygon draws the white basemap outlines, geom_map fills polygons by
# the bucketed prevalence, yellow-to-red ramp with one colour per bucket
# NOTE(review): assumes dat.DF$id values match shp.DF's region ids — confirm
ggplot()+
geom_polygon(data=shp.DF,aes(x=long,y=lat, group=group),fill='white',color='#7f7f7f',size=0.15)+
geom_map(data=dat.DF, map=shp.DF,aes(map_id=id, fill=`PoorFCS%`),color="#7f7f7f", size=0.15)+
scale_fill_manual(values=colorRampPalette(c("yellow", "red"))(length(breaks.lbl)))+
guides(fill=guide_legend(override.aes=list(colour=NA)))+
labs(title="% Poor FCS by Week")+
facet_wrap(~End.Date)+
theme(legend.position="top")
rm(breaks.lbl,breaks.prv)
# Bucket the absolute number of poor-FCS people in 100k steps up to 1.5M,
# with labels like " 0- 100k"
breaks.num <- seq(0,1500000,100000)
breaks.lbl <- sprintf("%2.0f-%2.0fk", breaks.num/1000, lead(breaks.num)/1000) %>% head(-1)
dat.DF <- dat.DF %>% mutate(`NumPoorFCS`=cut(NumPoor,breaks.num,breaks.lbl))
# Choropleth of the number of poor-FCS people by governorate and window,
# same basemap/ramp construction as the prevalence map above
ggplot()+
geom_polygon(data=shp.DF,aes(x=long,y=lat, group=group),fill='white',color='#7f7f7f',size=0.15)+
geom_map(data=dat.DF, map=shp.DF,aes(map_id=id, fill=`NumPoorFCS`),color="#7f7f7f", size=0.15)+
scale_fill_manual(values=colorRampPalette(c("yellow", "red"))(length(breaks.lbl)))+
guides(fill=guide_legend(override.aes=list(colour=NA)))+
labs(title="Num Poor FCS by Week")+
facet_wrap(~End.Date)+
theme(legend.position="top")
rm(breaks.lbl,breaks.num)
rm(dat.DF)
Now we compare the above results to what we get through our traditional mVAM estimates. These estimates use the standard Horvitz-Thompson estimator (i.e. weighted average) for stratified random sample survey designs with first-stage selection weights determined by number of phones owned by the household and a final post-stratification weighting by IDP status. Each survey round is discrete.
For ground-truth we use the governorate-level food-distribution targets for May 2018. These are typically tabulated by the estimated prevalence rate of poor food consumption coming from ad-hoc face-to-face convenience surveys conducted quarterly. Hence, for comparison we use the April mVAM data round and the Bayesian Sequential Estimates from the period between 2nd of April and 2nd of May.
##dataset preloaded as cmpr.DF
ggplot()+
geom_polygon(data=shp.DF,aes(x=long,y=lat, group=group),fill='white',color='#7f7f7f',size=0.15)+
geom_map(data=cmpr.DF[cmpr.DF$Type!='EFSA',],
map=shp.DF,aes(map_id=id, fill=NumPop),color="#7f7f7f", size=0.15)+
scale_fill_gradient(low='blue',high='red',
guide = guide_legend(
label.theme = element_text(angle = 90)))+
guides(fill=guide_legend(title='Estimated Population'))+
labs(title="Num Poor FCS by Source")+
facet_wrap(~Type)+
theme(legend.position="top")
# Map of the % difference vs the May food-distribution targets, excluding the
# 'Targeted' and 'EFSA' panels; diverging fill centred on white at zero
ggplot()+
geom_polygon(data=shp.DF,aes(x=long,y=lat, group=group),fill='white',color='#7f7f7f',size=0.15)+
geom_map(data=cmpr.DF[!(cmpr.DF$Type %in% c('Targeted','EFSA')),],
map=shp.DF,aes(map_id=id, fill=PctDiff*100),color="#7f7f7f", size=0.15)+
scale_fill_gradient2(low='red',high='blue',mid='white',
guide = guide_legend(
label.theme = element_text(angle = 90)))+
guides(fill=guide_legend(title='%Diff'))+
labs(title="% Difference with May Food Distribution Targets")+
facet_wrap(~Type,ncol=2)+
theme(legend.position="top")